'''
爬取BiliBili历史弹幕，并生成一个词云
'''
import html
import re

import jieba
import requests
import wordcloud
from tqdm import tqdm
# Request headers for the Bilibili HTTP calls; only a browser-like User-Agent
# is set. Never commit session cookies (SESSDATA, bili_jct, ...) to source
# control, not even commented out: they are live login credentials.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.62"
}
# Ask the user for the video's BV id; strip stray whitespace from a paste.
BV = input("请输入BV号：").strip()
BV_url = "https://m.bilibili.com/video/" + BV
# Headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so a positional dict would be sent as
# query-string parameters and the User-Agent header would never be set.
response1 = requests.get(BV_url, headers=headers, timeout=10)
response1.raise_for_status()
# The danmaku are served from a separate URL that has to be built from data
# embedded in the scripts of the video page.
js_str = response1.content.decode()
# Use a regex to pick the "cid" value out of the page. The first occurrence
# is the one needed for the danmaku URL; at least one digit is required so
# an empty id is never accepted.
cid_match = re.search(r'"cid":\s*(\d+)', js_str)
if cid_match is None:
    # Fail with a readable message instead of a bare IndexError.
    raise SystemExit("未能在视频页面中找到cid，请检查BV号是否正确")
data = cid_match.group(1)
# Build the URL of the danmaku XML document.
url = "https://comment.bilibili.com/{}.xml".format(data)

# Download the danmaku XML document.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# Let requests guess the text encoding from the response body.
response.encoding = response.apparent_encoding
# Each danmaku is the text content of a <d p="..."> element. The text is
# XML-escaped, so decode entities such as &amp; and &lt; back to the
# characters the viewer actually typed.
content_list = [
    html.unescape(text)
    for text in re.findall(r'<d p=".*?>(.*?)</d>', response.text)
]
# Save the danmaku, one per line. The file is opened once for the whole
# batch instead of once per danmaku.
# NOTE: append mode keeps whatever earlier runs already wrote to the file.
with open('弹幕1.txt', mode='a', encoding='utf-8') as f:
    for index in tqdm(content_list):
        f.write(index)
        f.write('\n')
# Read back everything stored in the danmaku file; the `with` block makes
# sure the file handle is closed afterwards.
with open('弹幕1.txt', mode='r', encoding='utf-8') as f:
    DanMu = f.read()
# Segment the text into words with jieba, then join them with spaces so the
# word cloud receives whitespace-separated tokens.
DanMu_list = jieba.lcut(DanMu)
string = ' '.join(DanMu_list)
wc = wordcloud.WordCloud(
    width=1200,
    height=600,
    background_color='white',
    # NOTE(review): 'msyh.ttc' must be resolvable on this machine; presumably
    # chosen so that Chinese glyphs render - confirm on non-Windows systems.
    font_path='msyh.ttc',
)
# Build the word cloud from the segmented text and write it out as a PNG.
wc.generate(string)
wc.to_file('弹幕1.png')